library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.12.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(SNPfiltR)
## This is SNPfiltR v.1.0.0
##
## Detailed usage information is available at: devonderaad.github.io/SNPfiltR/
##
## If you use SNPfiltR in your published work, please cite the following papers:
##
## DeRaad, D.A. (2022), SNPfiltR: an R package for interactive and reproducible SNP filtering. Molecular Ecology Resources, 00, 1-15. http://doi.org/10.1111/1755-0998.13618
##
## Knaus, Brian J., and Niklaus J. Grunwald. 2017. VCFR: a package to manipulate and visualize variant call format data in R. Molecular Ecology Resources, 17.1:44-53. http://doi.org/10.1111/1755-0998.12549
library(ggplot2)
Filter input SNP files based on MAC to increase signal to noise
ratio
#bring in sample info
#read in sample info csv
sample.info<-read.csv("~/Desktop/todi.2022/todiramphus.subset.csv")
#filter input vcf by MAC in order to increase signal to noise ratio for ADMIXTURE analysis
#read in filtered vcf
vcfR <- read.vcfR("~/Desktop/todi.2022/todi.unlinked.85.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 14
## header_line: 15
## variant count: 1892
## column count: 92
##
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 1892
## Character matrix gt cols: 92
## skip: 0
## nrows: 1892
## row_num: 0
##
Processed variant 1000
Processed variant: 1892
## All variants processed
vcfR
## ***** Object of Class vcfR *****
## 83 samples
## 32 CHROMs
## 1,892 variants
## Object size: 12.1 Mb
## 8.083 percent missing data
## ***** ***** *****
#filter by MAC thresholds
vcf.2<-min_mac(vcfR, min.mac = 2)
## 29.39% of SNPs fell below a minor allele count of 2 and were removed from the VCF

vcf.2
## ***** Object of Class vcfR *****
## 83 samples
## 30 CHROMs
## 1,336 variants
## Object size: 8.7 Mb
## 7.996 percent missing data
## ***** ***** *****
#filter by MAC thresholds
vcf.5<-min_mac(vcfR, min.mac = 5)
## 59.99% of SNPs fell below a minor allele count of 5 and were removed from the VCF

vcf.5
## ***** Object of Class vcfR *****
## 83 samples
## 27 CHROMs
## 757 variants
## Object size: 5 Mb
## 8.211 percent missing data
## ***** ***** *****
#filter by MAC thresholds
vcf.10<-min_mac(vcfR, min.mac = 10)
## 77.43% of SNPs fell below a minor allele count of 10 and were removed from the VCF

vcf.10
## ***** Object of Class vcfR *****
## 83 samples
## 23 CHROMs
## 427 variants
## Object size: 2.9 Mb
## 8.135 percent missing data
## ***** ***** *****
#write out vcf file
vcfR::write.vcf(vcf.2, file="~/Downloads/todi.mac2.vcf.gz")
vcfR::write.vcf(vcf.5, file="~/Downloads/todi.mac5.vcf.gz")
vcfR::write.vcf(vcf.10, file="~/Downloads/todi.mac10.vcf.gz")
#try removing sanctus
colnames(vcfR@gt)
## [1] "FORMAT" "T_albicilla_22581" "T_albicilla_22591"
## [4] "T_albicilla_22592" "T_albicilla_22611" "T_albicilla_85104"
## [7] "T_albicilla_85105" "T_chloris_12584" "T_chloris_12606"
## [10] "T_chloris_13960" "T_chloris_14446" "T_chloris_20983"
## [13] "T_chloris_22075" "T_chloris_23253" "T_chloris_23630"
## [16] "T_chloris_23632" "T_chloris_24382" "T_chloris_24727"
## [19] "T_colonus_2003089" "T_colonus_2003097" "T_colonus_2004070"
## [22] "T_leucopygius_32019" "T_leucopygius_32037" "T_leucopygius_34505"
## [25] "T_leucopygius_34508" "T_leucopygius_DOT6654" "T_sanctus_14877"
## [28] "T_sanctus_14879" "T_sanctus_24565" "T_sanctus_25117"
## [31] "T_sanctus_33267" "T_sanctus_34636" "T_sanctus_34659"
## [34] "T_sanctus_35549" "T_sanctus_72545" "T_sanctus_7567"
## [37] "T_sanctus_B29435" "T_sanctus_B32637" "T_sanctus_B33280"
## [40] "T_sanctus_B34636" "T_sanctus_B34659" "T_sanctus_B34946"
## [43] "T_sanctus_B43266" "T_sanctus_B50292" "T_sanctus_B50543"
## [46] "T_sanctus_B51072" "T_sanctus_B53055" "T_sanctus_B53068"
## [49] "T_sanctus_B54673" "T_sanctus_B57402" "T_sanctus_B59372"
## [52] "T_sanctus_B60180" "T_sanctus_B60181" "T_sanctus_B60182"
## [55] "T_sanctus_B60183" "T_sanctus_B60184" "T_saurophagus_18929"
## [58] "T_saurophagus_27804" "T_saurophagus_36179" "T_saurophagus_36283"
## [61] "T_saurophagus_36284" "T_saurophagus_69666" "T_sordidus_B33717"
## [64] "T_sordidus_B33718" "T_sordidus_B33719" "T_sordidus_B33720"
## [67] "T_sordidus_B44198" "T_sordidus_B44295" "T_sordidus_B44296"
## [70] "T_sordidus_B51462" "T_tristrami_18953" "T_tristrami_27723"
## [73] "T_tristrami_27752" "T_tristrami_27792" "T_tristrami_27793"
## [76] "T_tristrami_32049" "T_tristrami_33253" "T_tristrami_33839"
## [79] "T_tristrami_33858" "T_tristrami_33864" "T_tristrami_33867"
## [82] "T_tristrami_33895" "T_tristrami_36188" "T_tristrami_6704"
vcf.2a<-vcfR[,c(1:26,57:77,83,84)]
colnames(vcf.2a@gt)
## [1] "FORMAT" "T_albicilla_22581" "T_albicilla_22591"
## [4] "T_albicilla_22592" "T_albicilla_22611" "T_albicilla_85104"
## [7] "T_albicilla_85105" "T_chloris_12584" "T_chloris_12606"
## [10] "T_chloris_13960" "T_chloris_14446" "T_chloris_20983"
## [13] "T_chloris_22075" "T_chloris_23253" "T_chloris_23630"
## [16] "T_chloris_23632" "T_chloris_24382" "T_chloris_24727"
## [19] "T_colonus_2003089" "T_colonus_2003097" "T_colonus_2004070"
## [22] "T_leucopygius_32019" "T_leucopygius_32037" "T_leucopygius_34505"
## [25] "T_leucopygius_34508" "T_leucopygius_DOT6654" "T_saurophagus_18929"
## [28] "T_saurophagus_27804" "T_saurophagus_36179" "T_saurophagus_36283"
## [31] "T_saurophagus_36284" "T_saurophagus_69666" "T_sordidus_B33717"
## [34] "T_sordidus_B33718" "T_sordidus_B33719" "T_sordidus_B33720"
## [37] "T_sordidus_B44198" "T_sordidus_B44295" "T_sordidus_B44296"
## [40] "T_sordidus_B51462" "T_tristrami_18953" "T_tristrami_27723"
## [43] "T_tristrami_27752" "T_tristrami_27792" "T_tristrami_27793"
## [46] "T_tristrami_32049" "T_tristrami_33253" "T_tristrami_36188"
## [49] "T_tristrami_6704"
#remove invariant snps and singletons
vcf.2a<-min_mac(vcf.2a, min.mac = 2)
## 57.08% of SNPs fell below a minor allele count of 2 and were removed from the VCF

vcfR::write.vcf(vcf.2a, file="~/Downloads/todi.nosanctus.vcf.gz")
#try downsampling sanctus
vcf.2b<-vcfR[,c(1:34,57:77,83,84)]
colnames(vcf.2b@gt)
## [1] "FORMAT" "T_albicilla_22581" "T_albicilla_22591"
## [4] "T_albicilla_22592" "T_albicilla_22611" "T_albicilla_85104"
## [7] "T_albicilla_85105" "T_chloris_12584" "T_chloris_12606"
## [10] "T_chloris_13960" "T_chloris_14446" "T_chloris_20983"
## [13] "T_chloris_22075" "T_chloris_23253" "T_chloris_23630"
## [16] "T_chloris_23632" "T_chloris_24382" "T_chloris_24727"
## [19] "T_colonus_2003089" "T_colonus_2003097" "T_colonus_2004070"
## [22] "T_leucopygius_32019" "T_leucopygius_32037" "T_leucopygius_34505"
## [25] "T_leucopygius_34508" "T_leucopygius_DOT6654" "T_sanctus_14877"
## [28] "T_sanctus_14879" "T_sanctus_24565" "T_sanctus_25117"
## [31] "T_sanctus_33267" "T_sanctus_34636" "T_sanctus_34659"
## [34] "T_sanctus_35549" "T_saurophagus_18929" "T_saurophagus_27804"
## [37] "T_saurophagus_36179" "T_saurophagus_36283" "T_saurophagus_36284"
## [40] "T_saurophagus_69666" "T_sordidus_B33717" "T_sordidus_B33718"
## [43] "T_sordidus_B33719" "T_sordidus_B33720" "T_sordidus_B44198"
## [46] "T_sordidus_B44295" "T_sordidus_B44296" "T_sordidus_B51462"
## [49] "T_tristrami_18953" "T_tristrami_27723" "T_tristrami_27752"
## [52] "T_tristrami_27792" "T_tristrami_27793" "T_tristrami_32049"
## [55] "T_tristrami_33253" "T_tristrami_36188" "T_tristrami_6704"
#remove invariant snps and singletons
vcf.2b<-min_mac(vcf.2b, min.mac = 2)
## 48.57% of SNPs fell below a minor allele count of 2 and were removed from the VCF

vcfR::write.vcf(vcf.2b, file="~/Downloads/todi.downsamplesanctus.vcf.gz")
Show K=10 comparisons (sample order is the same in all plots
throughout)
knitr::include_graphics("/Users/devder/Desktop/todi.2022/admixture/k10.comparisons.png")

Visualize ADMIXTURE results no MAC filter
#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture/")
#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")
#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ylab("cross-validation error")+
xlab("K")+
scale_x_continuous(breaks = c(1:10))+
theme_classic()

#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture/")
#read in input file in order to get list of input samples in order
samps<-read.table("binary_fileset.fam")[,1]
#reorder sampling df to match order of the plot
sample.info<-sample.info[match(samps, sample.info$id),]
sample.info$id == samps
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}
par(mfrow=c(5,1))
#plot each run
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

for (i in 6:10){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

Visualize ADMIXTURE results with MAC 2 filter
#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture.mac2/")
#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")
#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ylab("cross-validation error")+
xlab("K")+
scale_x_continuous(breaks = c(1:10))+
theme_classic()

#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture.mac2/")
#read in input file in order to get list of input samples in order
samps<-read.table("binary_fileset.fam")[,1]
#reorder sampling df to match order of the plot
sample.info<-sample.info[match(samps, sample.info$id),]
sample.info$id == samps
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sample.info$id
## [1] "T_albicilla_22581" "T_albicilla_22591" "T_albicilla_22592"
## [4] "T_albicilla_22611" "T_albicilla_85104" "T_albicilla_85105"
## [7] "T_chloris_12584" "T_chloris_12606" "T_chloris_13960"
## [10] "T_chloris_14446" "T_chloris_20983" "T_chloris_22075"
## [13] "T_chloris_23253" "T_chloris_23630" "T_chloris_23632"
## [16] "T_chloris_24382" "T_chloris_24727" "T_colonus_2003089"
## [19] "T_colonus_2003097" "T_colonus_2004070" "T_leucopygius_32019"
## [22] "T_leucopygius_32037" "T_leucopygius_34505" "T_leucopygius_34508"
## [25] "T_leucopygius_DOT6654" "T_sanctus_14877" "T_sanctus_14879"
## [28] "T_sanctus_24565" "T_sanctus_25117" "T_sanctus_33267"
## [31] "T_sanctus_34636" "T_sanctus_34659" "T_sanctus_35549"
## [34] "T_sanctus_72545" "T_sanctus_7567" "T_sanctus_B29435"
## [37] "T_sanctus_B32637" "T_sanctus_B33280" "T_sanctus_B34636"
## [40] "T_sanctus_B34659" "T_sanctus_B34946" "T_sanctus_B43266"
## [43] "T_sanctus_B50292" "T_sanctus_B50543" "T_sanctus_B51072"
## [46] "T_sanctus_B53055" "T_sanctus_B53068" "T_sanctus_B54673"
## [49] "T_sanctus_B57402" "T_sanctus_B59372" "T_sanctus_B60180"
## [52] "T_sanctus_B60181" "T_sanctus_B60182" "T_sanctus_B60183"
## [55] "T_sanctus_B60184" "T_saurophagus_18929" "T_saurophagus_27804"
## [58] "T_saurophagus_36179" "T_saurophagus_36283" "T_saurophagus_36284"
## [61] "T_saurophagus_69666" "T_sordidus_B33717" "T_sordidus_B33718"
## [64] "T_sordidus_B33719" "T_sordidus_B33720" "T_sordidus_B44198"
## [67] "T_sordidus_B44295" "T_sordidus_B44296" "T_sordidus_B51462"
## [70] "T_tristrami_18953" "T_tristrami_27723" "T_tristrami_27752"
## [73] "T_tristrami_27792" "T_tristrami_27793" "T_tristrami_32049"
## [76] "T_tristrami_33253" "T_tristrami_33839" "T_tristrami_33858"
## [79] "T_tristrami_33864" "T_tristrami_33867" "T_tristrami_33895"
## [82] "T_tristrami_36188" "T_tristrami_6704"
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}
par(mfrow=c(5,1))
#plot each run
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

for (i in 6:10){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

Visualize ADMIXTURE results with MAC 5 filter
#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture.mac5/")
#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")
#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ylab("cross-validation error")+
xlab("K")+
scale_x_continuous(breaks = c(1:10))+
theme_classic()

#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture.mac5/")
#read in input file in order to get list of input samples in order
samps<-read.table("binary_fileset.fam")[,1]
#reorder sampling df to match order of the plot
sample.info<-sample.info[match(samps, sample.info$id),]
sample.info$id == samps
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}
par(mfrow=c(5,1))
#plot each run
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

for (i in 6:10){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

Visualize ADMIXTURE results with MAC 10 filter
#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture.mac10/")
#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")
#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ylab("cross-validation error")+
xlab("K")+
scale_x_continuous(breaks = c(1:10))+
theme_classic()

#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture.mac10/")
#read in input file in order to get list of input samples in order
samps<-read.table("binary_fileset.fam")[,1]
#reorder sampling df to match order of the plot
sample.info<-sample.info[match(samps, sample.info$id),]
sample.info$id == samps
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}
par(mfrow=c(5,1))
#plot each run
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

for (i in 6:10){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}

show separate run with no sanctus and MAC=2
#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture.nosanctus/")
#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")
#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ylab("cross-validation error")+
xlab("K")+
scale_x_continuous(breaks = c(1:10))+
theme_classic()

#read in input file in order to get list of input samples in order
samps<-read.table("binary_fileset.fam")[,1]
#reorder sampling df to match order of the plot
sample.info<-sample.info[match(samps, sample.info$id),]
sample.info$id == samps
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}
par(mfrow=c(1,1))
#plot each run
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}





for (i in 6:10){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}





so what we see is that removing sanctus completely solves this
problem, and even allows structure to separate albicilla and
saurophagus! So someone might say, this is just a hierarchical issue.
But the thing is, sanctus is not the outgroup! Leucopygius is the first
branching, and doesn’t affect the ability of any of the other groups to
cluster accurately, so this is fundamentally a separate issue from
hierarchical structure masking lower level structure. And it makes it
hard for empirical systems like this because there is not much
justification for removing a random ingroup.
#optimal K=7 shown here
knitr::include_graphics("/Users/devder/Desktop/todi.2022/admixture.nosanctus/labeled.barplot.png")

what if we just downsample sanctus to 10 samples and set MAC=2
#setwd to admixture directory run on the cluster
setwd("~/Desktop/todi.2022/admixture.downsamplesanctus/")
#read in log error values to determine optimal K
log<-read.table("log.errors.txt")[,c(3:4)]
#use double backslash to interpret the opening parentheses literally in the regular expression
log$V3<-gsub("\\(K=", "", log$V3)
log$V3<-gsub("):", "", log$V3)
#interpret K values as numerical
log$V3<-as.numeric(log$V3)
#rename columns
colnames(log)<-c("Kvalue","cross.validation.error")
#make plot showing the cross validation error across K values 1:10
ggplot(data=log, aes(x=Kvalue, y=cross.validation.error, group=1)) +
geom_line(linetype = "dashed")+
geom_point()+
ylab("cross-validation error")+
xlab("K")+
scale_x_continuous(breaks = c(1:10))+
theme_classic()

#read in input file in order to get list of input samples in order
samps<-read.table("binary_fileset.fam")[,1]
#reorder sampling df to match order of the plot
sample.info<-sample.info[match(samps, sample.info$id),]
sample.info$id == samps
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE NA NA NA NA NA
## [31] NA NA NA TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#read in all ten runs and save each dataframe in a list
runs<-list()
#read in log files
for (i in 1:10){
runs[[i]]<-read.table(paste0("binary_fileset.", i, ".Q"))
}
par(mfrow=c(1,1))
#plot each run
for (i in 1:5){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}





for (i in 6:10){
barplot(t(as.matrix(runs[[i]])), col=rainbow(i), ylab="Ancestry", border="black")
}





show plots labeled
#optimal k=6
knitr::include_graphics("/Users/devder/Desktop/todi.2022/admixture.downsamplesanctus/k6.labeled.png")

#here is k=9
knitr::include_graphics("/Users/devder/Desktop/todi.2022/admixture.downsamplesanctus/k9.labeled.png")

so downsampling sanctus certainly helps, but does not fix the
problem. ADMIXTURE still refuses to split albicilla and saurophagus in
the presence of sanctus, and starts adding spurious ancestry groups to
sanctus
I think this does solve our problem for this paper though, because
the k=6 graph above can be paired with a solo albicilla/saurophagus plot
to show the entire structure of the group.